// SPDX-License-Identifier: GPL-2.0
/*
 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
 *
 *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *
 *  Interactivity improvements by Mike Galbraith
 *  (C) 2007 Mike Galbraith <efault@gmx.de>
 *
 *  Various enhancements by Dmitry Adamushko.
 *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
 *
 *  Group scheduling enhancements by Srivatsa Vaddagiri
 *  Copyright IBM Corporation, 2007
 *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
 *
 *  Scaled math optimizations by Thomas Gleixner
 *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
 *
 *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
 */
#include "sched.h"

/*
 * Targeted preemption latency for CPU-bound tasks:
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length
 * and have no persistent notion like in traditional, time-slice
 * based scheduling concepts.
 *
 * (to see the precise effective timeslice length of your workload,
 *  run vmstat and monitor the context-switches (cs) field)
 *
 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_latency			= 6000000ULL;
static unsigned int normalized_sysctl_sched_latency	= 6000000ULL;

/*
 * The initial- and re-scaling of tunables is configurable
 *
 * Options are:
 *
 *   SCHED_TUNABLESCALING_NONE - unscaled, always *1
 *   SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
 *   SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
 *
 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
 */
enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;

/*
 * Minimal preemption granularity for CPU-bound tasks:
 *
 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_min_granularity			= 750000ULL;
static unsigned int normalized_sysctl_sched_min_granularity	= 750000ULL;

/*
 * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
 */
static unsigned int sched_nr_latency = 8;

/*
 * After fork, child runs first. If set to 0 (default) then
 * parent will (try to) run first.
 */
unsigned int sysctl_sched_child_runs_first __read_mostly;

/*
 * SCHED_OTHER wake-up granularity.
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 *
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_wakeup_granularity			= 1000000UL;
static unsigned int normalized_sysctl_sched_wakeup_granularity	= 1000000UL;

const_debug unsigned int sysctl_sched_migration_cost	= 500000UL;

/*
 * The margin used when comparing utilization with CPU capacity:
 * util * margin < capacity * 1024
 *
 * (default: ~20%)
 */
static unsigned int capacity_margin			= 1280;

static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
    /* Any cached reciprocal is stale once the weight changes. */
    lw->inv_weight = 0;
    lw->weight += inc;
}

static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
    /* Invalidate the cached reciprocal before shrinking the weight. */
    lw->inv_weight = 0;
    lw->weight -= dec;
}

static inline void update_load_set(struct load_weight *lw, unsigned long w)
{
    /* Replace the weight outright; the reciprocal is recomputed lazily. */
    lw->inv_weight = 0;
    lw->weight = w;
}

/*
 * Increase the granularity value when there are more CPUs,
 * because with more CPUs the 'effective latency' as visible
 * to users decreases. But the relationship is not linear,
 * so pick a second-best guess by going with the log2 of the
 * number of CPUs.
 *
 * This idea comes from the SD scheduler of Con Kolivas:
 */
static unsigned int get_update_sysctl_factor(void)
{
    /* Cap the CPU count at 8 so tunables do not grow without bound. */
    unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);

    if (sysctl_sched_tunable_scaling == SCHED_TUNABLESCALING_NONE)
        return 1;

    if (sysctl_sched_tunable_scaling == SCHED_TUNABLESCALING_LINEAR)
        return cpus;

    /* SCHED_TUNABLESCALING_LOG, and the fallback for unknown values. */
    return 1 + ilog2(cpus);
}

static void update_sysctl(void)
{
    unsigned int factor = get_update_sysctl_factor();

    /* Rescale each tunable from its normalized (single-CPU) baseline. */
    sysctl_sched_min_granularity =
        factor * normalized_sysctl_sched_min_granularity;
    sysctl_sched_latency =
        factor * normalized_sysctl_sched_latency;
    sysctl_sched_wakeup_granularity =
        factor * normalized_sysctl_sched_wakeup_granularity;
}

/* Scale the latency/granularity tunables once the online CPU count is known. */
void sched_init_granularity(void)
{
    update_sysctl();
}

#define WMULT_CONST	(~0U)
#define WMULT_SHIFT	32

/*
 * Lazily (re)compute lw->inv_weight ~= WMULT_CONST / lw->weight.
 * A value of 0 marks the cache as invalid (see update_load_*()).
 */
static void __update_inv_weight(struct load_weight *lw)
{
    unsigned long w;

    if (likely(lw->inv_weight))
        return;

    w = scale_load_down(lw->weight);

    /* Clamp extremes so the 32x32 multiply in __calc_delta() stays sane. */
    if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
        lw->inv_weight = 1;
    else if (unlikely(!w))
        lw->inv_weight = WMULT_CONST;
    else
        lw->inv_weight = WMULT_CONST / w;
}

/*
 * delta_exec * weight / lw.weight
 *   OR
 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
 *
 * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
 * we're guaranteed shift stays positive because inv_weight is guaranteed to
 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
 *
 * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
 * weight/lw.weight <= 1, and therefore our shift will also be positive.
 */
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
    u64 fact = scale_load_down(weight);
    int shift = WMULT_SHIFT;

    __update_inv_weight(lw);

    /* Squeeze fact into 32 bits, compensating via the final shift. */
    if (unlikely(fact >> 32)) {
        while (fact >> 32) {
            fact >>= 1;
            shift--;
        }
    }

    /* hint to use a 32x32->64 mul */
    fact = (u64)(u32)fact * lw->inv_weight;

    /* Renormalize once more after folding in inv_weight. */
    while (fact >> 32) {
        fact >>= 1;
        shift--;
    }

    return mul_u64_u32_shr(delta_exec, fact, shift);
}

/**************************************************************
 * CFS operations on generic schedulable entities:
 */

/* Every sched_entity here is embedded in a task_struct (no group sched). */
static inline struct task_struct *task_of(struct sched_entity *se)
{
    return container_of(se, struct task_struct, se);
}

/* The cfs_rq is embedded directly in its rq, so container_of() suffices. */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
    return container_of(cfs_rq, struct rq, cfs);
}

/* CFS runqueue that task @p is (or would be) queued on. */
static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
    return &task_rq(p)->cfs;
}

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
    struct task_struct *p = task_of(se);
    struct rq *rq = task_rq(p);

    return &rq->cfs;
}

/**************************************************************
 * Scheduling class tree data structure manipulation methods:
 */

static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
    /* Signed comparison of the difference copes with u64 wrap-around. */
    if ((i64)(vruntime - max_vruntime) > 0)
        return vruntime;

    return max_vruntime;
}

static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
    /* Signed comparison of the difference copes with u64 wrap-around. */
    if ((i64)(vruntime - min_vruntime) < 0)
        return vruntime;

    return min_vruntime;
}

/* Order entities by vruntime; the signed cast handles u64 wrap-around. */
static inline int entity_before(struct sched_entity *a,
                struct sched_entity *b)
{
    return (i64)(a->vruntime - b->vruntime) < 0;
}

/*
 * Advance cfs_rq->min_vruntime to track the smallest vruntime among the
 * current task and the leftmost (smallest-key) queued entity. min_vruntime
 * only ever moves forward.
 */
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
    struct sched_entity *curr = cfs_rq->curr;
    struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);

    u64 vruntime = cfs_rq->min_vruntime;

    if (curr) {
        if (curr->on_rq)
            vruntime = curr->vruntime;
        else
            curr = NULL; /* a dequeued current contributes nothing */
    }

    if (leftmost) { /* non-empty tree */
        struct sched_entity *se;
        se = rb_entry(leftmost, struct sched_entity, run_node);

        if (!curr)
            vruntime = se->vruntime;
        else
            vruntime = min_vruntime(vruntime, se->vruntime);
    }

    /* ensure we never gain time by being placed backwards. */
    cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
#ifndef CONFIG_64BIT
    /* Publish a copy after a write barrier for lockless 32-bit readers. */
    smp_wmb();
    cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
}

/*
 * Enqueue an entity into the rb-tree:
 */
/*
 * Enqueue an entity into the rb-tree, keyed by vruntime. The cached-leftmost
 * hint is updated when the new node ends up leftmost.
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
    struct rb_node *parent = NULL;
    struct sched_entity *entry;
    bool leftmost = true;

    /*
     * Find the right place in the rbtree:
     */
    while (*link) {
        parent = *link;
        entry = rb_entry(parent, struct sched_entity, run_node);
        /*
         * We don't care about collisions. Nodes with
         * the same key stay together.
         */
        if (entity_before(se, entry)) {
            link = &parent->rb_left;
        } else {
            /* Descended right at least once: cannot be leftmost. */
            link = &parent->rb_right;
            leftmost = false;
        }
    }

    rb_link_node(&se->run_node, parent, link);
    rb_insert_color_cached(&se->run_node,
                   &cfs_rq->tasks_timeline, leftmost);
}

/* Remove an entity from the rb-tree; the leftmost cache is maintained. */
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
}

/* Leftmost (smallest-vruntime) entity, or NULL when the tree is empty. */
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
    struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);

    return left ? rb_entry(left, struct sched_entity, run_node) : NULL;
}

/* In-order successor of @se in the timeline, or NULL at the end. */
static struct sched_entity *__pick_next_entity(struct sched_entity *se)
{
    struct rb_node *next = rb_next(&se->run_node);

    return next ? rb_entry(next, struct sched_entity, run_node) : NULL;
}

/*
 * delta /= w
 */
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
    /* NICE_0_LOAD is the common case, where the scaling degenerates to *1. */
    if (unlikely(se->load.weight != NICE_0_LOAD))
        delta = __calc_delta(delta, NICE_0_LOAD, &se->load);

    return delta;
}

/*
 * The idea is to set a period in which each task runs once.
 *
 * When there are too many tasks (sched_nr_latency) we have to stretch
 * this period because otherwise the slices get too small.
 *
 * p = (nr <= nl) ? l : l*nr/nl
 */
static u64 __sched_period(unsigned long nr_running)
{
    /* Up to sched_nr_latency tasks share the fixed latency target ... */
    if (likely(nr_running <= sched_nr_latency))
        return sysctl_sched_latency;

    /* ... beyond that, stretch the period to keep slices >= min granularity. */
    return nr_running * sysctl_sched_min_granularity;
}

/*
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight.
 *
 * s = p*P[w/rw]
 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    /* +!se->on_rq counts @se itself when it is not queued yet. */
    u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
    struct load_weight *load;
    struct load_weight lw;

    cfs_rq = cfs_rq_of(se);
    load = &cfs_rq->load;

    if (unlikely(!se->on_rq)) {
        /* Pretend @se were enqueued: fold its weight into a local copy. */
        lw = cfs_rq->load;
        update_load_add(&lw, se->load.weight);
        load = &lw;
    }
    return __calc_delta(slice, se->load.weight, load);
}

/*
 * We calculate the vruntime slice of a to-be-inserted task.
 *
 * vs = s/w
 */
static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    /* Convert the wall-time slice into vruntime units for @se's weight. */
    return calc_delta_fair(sched_slice(cfs_rq, se), se);
}

#include "pelt.h"
#include "sched-pelt.h"

static unsigned long task_h_load(struct task_struct *p);
static unsigned long capacity_of(int cpu);

/* Give new sched_entity start runnable values to heavy its load in infant time */
void init_entity_runnable_average(struct sched_entity *se)
{
    struct sched_avg *sa = &se->avg;

    memset(sa, 0, sizeof(*sa));

    /*
     * Seed the load averages with the full weight so a brand-new task
     * is not underestimated before PELT has gathered any samples.
     */
    sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
    se->runnable_weight = se->load.weight;

    /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}

static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
static void attach_entity_cfs_rq(struct sched_entity *se);

/*
 * With new tasks being created, their initial util_avgs are extrapolated
 * based on the cfs_rq's current util_avg:
 *
 *   util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
 *
 * However, in many cases, the above util_avg does not give a desired
 * value. Moreover, the sum of the util_avgs may be divergent, such
 * as when the series is a harmonic series.
 *
 * To solve this problem, we also cap the util_avg of successive tasks to
 * only 1/2 of the left utilization budget:
 *
 *   util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
 *
 * where n denotes the nth task and cpu_scale the CPU capacity.
 *
 * For example, for a CPU with 1024 of capacity, a simplest series from
 * the beginning would be like:
 *
 *  task  util_avg: 512, 256, 128,  64,  32,   16,    8, ...
 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
 *
 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
 * if util_avg > util_avg_cap.
 */
void post_init_entity_util_avg(struct sched_entity *se)
{
    struct cfs_rq *cfs_rq = cfs_rq_of(se);
    struct sched_avg *sa = &se->avg;
    long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
    /* Half of the capacity that is still unused on this CPU. */
    long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
    struct task_struct *p;

    if (cap > 0) {
        if (cfs_rq->avg.util_avg != 0) {
            /* Extrapolate from the cfs_rq's current utilization ... */
            sa->util_avg  = cfs_rq->avg.util_avg * se->load.weight;
            sa->util_avg /= (cfs_rq->avg.load_avg + 1);

            /* ... but clamp to half the remaining headroom. */
            if (sa->util_avg > cap)
                sa->util_avg = cap;
        } else {
            sa->util_avg = cap;
        }
    }

    p = task_of(se);
    if (p->sched_class != &fair_sched_class) {
        /*
         * For !fair tasks do:
         *
         update_cfs_rq_load_avg(now, cfs_rq);
         attach_entity_load_avg(cfs_rq, se, 0);
         switched_from_fair(rq, p);
         *
         * such that the next switched_to_fair() has the
         * expected state.
         */
        se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
        return;
    }

    attach_entity_cfs_rq(se);
}

/*
 * Update the current task's runtime statistics.
 */
static void update_curr(struct cfs_rq *cfs_rq)
{
    struct sched_entity *curr = cfs_rq->curr;
    u64 now = rq_clock_task(rq_of(cfs_rq));
    u64 delta_exec;

    if (unlikely(!curr))
        return;

    /* Nothing to account if the clock did not advance (or went backwards). */
    delta_exec = now - curr->exec_start;
    if (unlikely((i64)delta_exec <= 0))
        return;

    curr->exec_start = now;
    curr->sum_exec_runtime += delta_exec;
    /* Charge weight-scaled (fair) time to the entity's vruntime. */
    curr->vruntime += calc_delta_fair(delta_exec, curr);
    update_min_vruntime(cfs_rq);
}

/* Sched-class hook: account elapsed runtime for the rq's current task. */
static void update_curr_fair(struct rq *rq)
{
    update_curr(cfs_rq_of(&rq->curr->se));
}

/*
 * We are picking a new current task - update its stats:
 */
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    /*
     * We are starting a new run period:
     * stamp the start so update_curr() can measure the elapsed runtime.
     */
    se->exec_start = rq_clock_task(rq_of(cfs_rq));
}

/**************************************************
 * Scheduling class queueing methods:
 */

static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    struct rq *rq = rq_of(cfs_rq);

    /* The entity's weight counts towards both the cfs_rq and the rq. */
    update_load_add(&cfs_rq->load, se->load.weight);
    update_load_add(&rq->load, se->load.weight);
    list_add(&se->group_node, &rq->cfs_tasks);
    cfs_rq->nr_running++;
}

static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    /* Undo account_entity_enqueue(): drop weight and list membership. */
    cfs_rq->nr_running--;
    list_del_init(&se->group_node);
    update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
    update_load_sub(&cfs_rq->load, se->load.weight);
}

/*
 * Signed add and clamp on underflow.
 *
 * The 'res = 0' clamp triggers only when adding a negative value wraps the
 * unsigned result upwards, i.e. when the true sum would have been negative.
 *
 * Explicitly do a load-store to ensure the intermediate value never hits
 * memory. This allows lockless observations without ever seeing the negative
 * values.
 */
#define add_positive(_ptr, _val) do {                           \
    typeof(_ptr) ptr = (_ptr);                              \
    typeof(_val) val = (_val);                              \
    typeof(*ptr) res, var = READ_ONCE(*ptr);                \
                                \
    res = var + val;                                        \
                                \
    if (val < 0 && res > var)                               \
        res = 0;                                        \
                                \
    WRITE_ONCE(*ptr, res);                                  \
} while (0)

/*
 * Unsigned subtract and clamp on underflow.
 *
 * Note that _val is snapshotted as typeof(*_ptr), so it is evaluated exactly
 * once; underflow shows up as res > var and is clamped to 0.
 *
 * Explicitly do a load-store to ensure the intermediate value never hits
 * memory. This allows lockless observations without ever seeing the negative
 * values.
 */
#define sub_positive(_ptr, _val) do {				\
    typeof(_ptr) ptr = (_ptr);				\
    typeof(*ptr) val = (_val);				\
    typeof(*ptr) res, var = READ_ONCE(*ptr);		\
    res = var - val;					\
    if (res > var)						\
        res = 0;					\
    WRITE_ONCE(*ptr, res);					\
} while (0)

/*
 * Remove and clamp on negative, from a local variable.
 *
 * A variant of sub_positive(), which does not use explicit load-store
 * and is thus optimized for local variable updates: subtracting at most
 * *_ptr itself guarantees the result never goes below zero.
 */
#define lsub_positive(_ptr, _val) do {				\
    typeof(_ptr) ptr = (_ptr);				\
    *ptr -= min_t(typeof(*ptr), *ptr, _val);		\
} while (0)

/* Add @se's runnable PELT contributions to its cfs_rq's sums. */
static inline void
enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    cfs_rq->runnable_weight += se->runnable_weight;

    cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
    cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
}

/*
 * Remove @se's runnable PELT contributions; sub_positive() clamps at zero
 * so lockless readers never observe negative sums.
 */
static inline void
dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    cfs_rq->runnable_weight -= se->runnable_weight;

    sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
    sub_positive(&cfs_rq->avg.runnable_load_sum,
             se_runnable(se) * se->avg.runnable_load_sum);
}

/* Add @se's load-average contribution to its cfs_rq. */
static inline void
enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    cfs_rq->avg.load_avg += se->avg.load_avg;
    cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
}

/* Remove @se's load-average contribution, clamped at zero on underflow. */
static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
    sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
}

/*
 * Change @se's weight/runnable-weight. The entity is taken off all the
 * cfs_rq accounting, its averages are recomputed under the new weight,
 * and it is re-added — so the cfs_rq sums stay consistent throughout.
 */
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
                unsigned long weight, unsigned long runnable)
{
    if (se->on_rq) {
        /* commit outstanding execution time */
        if (cfs_rq->curr == se)
            update_curr(cfs_rq);
        account_entity_dequeue(cfs_rq, se);
        dequeue_runnable_load_avg(cfs_rq, se);
    }
    dequeue_load_avg(cfs_rq, se);

    se->runnable_weight = runnable;
    update_load_set(&se->load, weight);

    /* Recompute the averages from the (unchanged) sums under the new weight. */
    do {
        u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;

        se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
        se->avg.runnable_load_avg =
            div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
    } while (0);

    enqueue_load_avg(cfs_rq, se);
    if (se->on_rq) {
        account_entity_enqueue(cfs_rq, se);
        enqueue_runnable_load_avg(cfs_rq, se);
    }
}

/* Apply the weight of priority @prio to task @p. */
void reweight_task(struct task_struct *p, int prio)
{
    struct sched_entity *se = &p->se;
    struct cfs_rq *cfs_rq = cfs_rq_of(se);
    struct load_weight *load = &se->load;
    unsigned long weight = scale_load(sched_prio_to_weight[prio]);

    reweight_entity(cfs_rq, se, weight, weight);
    /* Use the precomputed reciprocal rather than the lazily computed one. */
    load->inv_weight = sched_prio_to_wmult[prio];
}

static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
{
    struct rq *rq = rq_of(cfs_rq);

    /* Only the root cfs_rq (or a migration) drives cpufreq updates. */
    if (&rq->cfs != cfs_rq && !(flags & SCHED_CPUFREQ_MIGRATION))
        return;

    /*
     * There are a few boundary cases this might miss but it should
     * get called often enough that that should (hopefully) not be
     * a real problem.
     *
     * It will not get called when we go idle, because the idle
     * thread is a different class (!fair), nor will the utilization
     * number include things like RT tasks.
     *
     * As is, the util number is not freq-invariant (we'd have to
     * implement arch_scale_freq_capacity() for that).
     *
     * See cpu_util().
     */
    cpufreq_update_util(rq, flags);
}

/**
 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
 * @now: current time, as per cfs_rq_clock_task()
 * @cfs_rq: cfs_rq to update
 *
 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
 * avg. The immediate corollary is that all (fair) tasks must be attached, see
 * post_init_entity_util_avg().
 *
 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
 *
 * Returns true if the load decayed or we removed load.
 *
 * Since both these conditions indicate a changed cfs_rq->avg.load we should
 * call update_tg_load_avg() when this function returns true.
 */
static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
    unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
    struct sched_avg *sa = &cfs_rq->avg;
    int decayed = 0;

    if (cfs_rq->removed.nr) {
        unsigned long r;
        u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;

        /*
         * Fold in load removed by tasks that migrated away
         * (see remove_entity_load_avg()); swap() zeroes the
         * accumulators under the lock.
         */
        raw_spin_lock(&cfs_rq->removed.lock);
        swap(cfs_rq->removed.util_avg, removed_util);
        swap(cfs_rq->removed.load_avg, removed_load);
        swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
        cfs_rq->removed.nr = 0;
        raw_spin_unlock(&cfs_rq->removed.lock);

        /* r * divider scales the removed _avg back into _sum units. */
        r = removed_load;
        sub_positive(&sa->load_avg, r);
        sub_positive(&sa->load_sum, r * divider);

        r = removed_util;
        sub_positive(&sa->util_avg, r);
        sub_positive(&sa->util_sum, r * divider);

        decayed = 1;
    }

    decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);

#ifndef CONFIG_64BIT
    /* Publish a copy after a write barrier for torn-read detection (32-bit). */
    smp_wmb();
    cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif

    if (decayed)
        cfs_rq_util_change(cfs_rq, 0);

    return decayed;
}

/**
 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
 * @cfs_rq: cfs_rq to attach to
 * @se: sched_entity to attach
 * @flags: migration hints
 *
 * Must call update_cfs_rq_load_avg() before this, since we rely on
 * cfs_rq->avg.last_update_time being current.
 */
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
    u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;

    /*
     * When we attach the @se to the @cfs_rq, we must align the decay
     * window because without that, really weird and wonderful things can
     * happen.
     *
     * XXX illustrate
     */
    se->avg.last_update_time = cfs_rq->avg.last_update_time;
    se->avg.period_contrib = cfs_rq->avg.period_contrib;

    /*
     * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
     * period_contrib. This isn't strictly correct, but since we're
     * entirely outside of the PELT hierarchy, nobody cares if we truncate
     * _sum a little.
     */
    se->avg.util_sum = se->avg.util_avg * divider;

    /* load_sum = load_avg * divider / weight; guard against zero weight. */
    se->avg.load_sum = divider;
    if (se_weight(se)) {
        se->avg.load_sum =
            div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
    }

    se->avg.runnable_load_sum = se->avg.load_sum;

    enqueue_load_avg(cfs_rq, se);
    cfs_rq->avg.util_avg += se->avg.util_avg;
    cfs_rq->avg.util_sum += se->avg.util_sum;

    cfs_rq_util_change(cfs_rq, flags);
}

/**
 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
 * @cfs_rq: cfs_rq to detach from
 * @se: sched_entity to detach
 *
 * Must call update_cfs_rq_load_avg() before this, since we rely on
 * cfs_rq->avg.last_update_time being current.
 */
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    /* Clamped subtraction keeps lockless readers from seeing negatives. */
    dequeue_load_avg(cfs_rq, se);
    sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
    sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);

    cfs_rq_util_change(cfs_rq, 0);
}

/*
 * Optional action to be done while updating the load average
 */
#define UPDATE_TG	0x1
#define SKIP_AGE_LOAD	0x2
#define DO_ATTACH	0x4

/*
 * Update task and its cfs_rq load average.
 *
 * Ages @se's PELT averages (unless SKIP_AGE_LOAD), decays the cfs_rq
 * averages, and — for a freshly-migrated task being enqueued (DO_ATTACH
 * with a zeroed last_update_time) — attaches @se to the new cfs_rq.
 */
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
    u64 now = cfs_rq_clock_task(cfs_rq);
    struct rq *rq = rq_of(cfs_rq);
    int cpu = cpu_of(rq);

    /*
     * Track task load average for carrying it to new CPU after migrated, and
     * track group sched_entity load average for task_h_load calc in migration
     */
    if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
        __update_load_avg_se(now, cpu, cfs_rq, se);

    /*
     * The return value (whether anything decayed) is not needed here:
     * cfs_rq_util_change() is already driven from inside
     * update_cfs_rq_load_avg() itself.
     */
    update_cfs_rq_load_avg(now, cfs_rq);

    if (!se->avg.last_update_time && (flags & DO_ATTACH)) {

        /*
         * DO_ATTACH means we're here from enqueue_entity().
         * !last_update_time means we've passed through
         * migrate_task_rq_fair() indicating we migrated.
         *
         * IOW we're enqueueing a task on a new CPU.
         */
        attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
    }
}

#ifndef CONFIG_64BIT
/*
 * On 32-bit, a u64 load can tear. Retry until the copy (written after an
 * smp_wmb() in update_cfs_rq_load_avg()) matches the primary value.
 */
static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
{
    u64 last_update_time_copy;
    u64 last_update_time;

    do {
        last_update_time_copy = cfs_rq->load_last_update_time_copy;
        smp_rmb();
        last_update_time = cfs_rq->avg.last_update_time;
    } while (last_update_time != last_update_time_copy);

    return last_update_time;
}
#else
/* 64-bit loads are atomic: read the timestamp directly. */
static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
{
    return cfs_rq->avg.last_update_time;
}
#endif

/*
 * Synchronize entity load avg of dequeued entity without locking
 * the previous rq: decay @se's blocked averages up to the cfs_rq's
 * last (lockless-read) update time.
 */
void sync_entity_load_avg(struct sched_entity *se)
{
    struct cfs_rq *cfs_rq = cfs_rq_of(se);
    u64 last_update_time;

    last_update_time = cfs_rq_last_update_time(cfs_rq);
    __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se);
}

/*
 * Task first catches up with cfs_rq, and then subtract
 * itself from the cfs_rq (task must be off the queue now).
 * The amounts are accumulated under removed.lock and folded into the
 * cfs_rq later by update_cfs_rq_load_avg().
 */
void remove_entity_load_avg(struct sched_entity *se)
{
    struct cfs_rq *cfs_rq = cfs_rq_of(se);
    unsigned long flags;

    /*
     * tasks cannot exit without having gone through wake_up_new_task() ->
     * post_init_entity_util_avg() which will have added things to the
     * cfs_rq, so we can remove unconditionally.
     *
     * Similarly for groups, they will have passed through
     * post_init_entity_util_avg() before unregister_sched_fair_group()
     * calls this.
     */

    sync_entity_load_avg(se);

    raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
    ++cfs_rq->removed.nr;
    cfs_rq->removed.util_avg	+= se->avg.util_avg;
    cfs_rq->removed.load_avg	+= se->avg.load_avg;
    cfs_rq->removed.runnable_sum	+= se->avg.load_sum; /* == runnable_sum */
    raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
}

/* Runnable (queued) load average of @cfs_rq. */
static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
{
    return cfs_rq->avg.runnable_load_avg;
}

/* Total (runnable + blocked) load average of @cfs_rq. */
static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
{
    return cfs_rq->avg.load_avg;
}

/* Current PELT utilization of @p; READ_ONCE for lockless callers. */
static inline unsigned long task_util(struct task_struct *p)
{
    return READ_ONCE(p->se.avg.util_avg);
}

static inline unsigned long _task_util_est(struct task_struct *p)
{
    struct util_est ue = READ_ONCE(p->se.avg.util_est);

    /* The UTIL_AVG_UNCHANGED bit also flags "estimate in use". */
    return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED);
}

/* Larger of @p's instantaneous utilization and its estimate. */
static inline unsigned long task_util_est(struct task_struct *p)
{
    return max(task_util(p), _task_util_est(p));
}

/* Fold @p's estimated utilization into @cfs_rq's enqueued estimate. */
static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
                    struct task_struct *p)
{
    unsigned int enqueued;

    if (!sched_feat(UTIL_EST))
        return;

    /* Update root cfs_rq's estimated utilization */
    enqueued  = cfs_rq->avg.util_est.enqueued;
    enqueued += _task_util_est(p);
    /* WRITE_ONCE pairs with lockless readers of util_est.enqueued. */
    WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
}

/*
 * Check if a (signed) value is within a specified (unsigned) margin,
 * based on the observation that:
 *
 *     abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
 *
 * NOTE: this only works when value + margin < INT_MAX.
 */
static inline bool within_margin(int value, int margin)
{
    return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
}

static void
util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
{
    long last_ewma_diff;
    struct util_est ue;

    if (!sched_feat(UTIL_EST))
        return;

    /* Update root cfs_rq's estimated utilization */
    ue.enqueued  = cfs_rq->avg.util_est.enqueued;
    /* min_t() clamps the subtraction at zero. */
    ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p));
    WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);

    /*
     * Skip update of task's estimated utilization when the task has not
     * yet completed an activation, e.g. being migrated.
     */
    if (!task_sleep)
        return;

    /*
     * If the PELT values haven't changed since enqueue time,
     * skip the util_est update.
     */
    ue = p->se.avg.util_est;
    if (ue.enqueued & UTIL_AVG_UNCHANGED)
        return;

    /*
     * Skip update of task's estimated utilization when its EWMA is
     * already ~1% close to its last activation value.
     */
    ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
    last_ewma_diff = ue.enqueued - ue.ewma;
    if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
        return;

    /*
     * Update Task's estimated utilization
     *
     * When *p completes an activation we can consolidate another sample
     * of the task size. This is done by storing the current PELT value
     * as ue.enqueued and by using this value to update the Exponential
     * Weighted Moving Average (EWMA):
     *
     *  ewma(t) = w *  task_util(p) + (1-w) * ewma(t-1)
     *          = w *  task_util(p) +         ewma(t-1)  - w * ewma(t-1)
     *          = w * (task_util(p) -         ewma(t-1)) +     ewma(t-1)
     *          = w * (      last_ewma_diff            ) +     ewma(t-1)
     *          = w * (last_ewma_diff  +  ewma(t-1) / w)
     *
     * Where 'w' is the weight of new samples, which is configured to be
     * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
     */
    ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
    ue.ewma  += last_ewma_diff;
    ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
    WRITE_ONCE(p->se.avg.util_est, ue);
}

/* True if @p's estimated utilization fits @capacity with the ~20% margin. */
static inline int task_fits_capacity(struct task_struct *p, long capacity)
{
    return capacity * 1024 > task_util_est(p) * capacity_margin;
}

/*
 * Choose a starting vruntime for @se relative to cfs_rq->min_vruntime:
 * new tasks (@initial) are debited one slice; wakers get a bounded
 * sleep credit.
 */
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
    u64 vruntime = cfs_rq->min_vruntime;

    /*
     * The 'current' period is already promised to the current tasks,
     * however the extra weight of the new task will slow them down a
     * little, place the new task so that it fits in the slot that
     * stays open at the end.
     */
    if (initial && sched_feat(START_DEBIT))
        vruntime += sched_vslice(cfs_rq, se);

    /* sleeps up to a single latency don't count. */
    if (!initial) {
        unsigned long thresh = sysctl_sched_latency;

        /*
         * Halve their sleep time's effect, to allow
         * for a gentler effect of sleepers:
         */
        if (sched_feat(GENTLE_FAIR_SLEEPERS))
            thresh >>= 1;

        vruntime -= thresh;
    }

    /* ensure we never gain time by being placed backwards. */
    se->vruntime = max_vruntime(se->vruntime, vruntime);
}

/*
 * MIGRATION
 *
 *	dequeue
 *	  update_curr()
 *	    update_min_vruntime()
 *	  vruntime -= min_vruntime
 *
 *	enqueue
 *	  update_curr()
 *	    update_min_vruntime()
 *	  vruntime += min_vruntime
 *
 * this way the vruntime transition between RQs is done when both
 * min_vruntime are up-to-date.
 *
 * WAKEUP (remote)
 *
 *	->migrate_task_rq_fair() (p->state == TASK_WAKING)
 *	  vruntime -= min_vruntime
 *
 *	enqueue
 *	  update_curr()
 *	    update_min_vruntime()
 *	  vruntime += min_vruntime
 *
 * this way we don't have the most up-to-date min_vruntime on the originating
 * CPU and an up-to-date min_vruntime on the destination CPU.
 */

/*
 * enqueue_entity() - insert @se into @cfs_rq's timeline and accounting.
 *
 * @flags drives vruntime renormalisation: an entity keeps an absolute
 * vruntime unless it was dequeued for sleep without migrating, in which
 * case min_vruntime was subtracted and must be re-added here. The order
 * of renormalisation relative to update_curr() is deliberate and must
 * not be changed.
 */
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
    /* Renormalise unless this is a non-migrating wakeup (see MIGRATION). */
    bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
    bool curr = cfs_rq->curr == se;

    /*
     * If we're the current task, we must renormalise before calling
     * update_curr().
     */
    if (renorm && curr)
        se->vruntime += cfs_rq->min_vruntime;

    update_curr(cfs_rq);

    /*
     * Otherwise, renormalise after, such that we're placed at the current
     * moment in time, instead of some random moment in the past. Being
     * placed in the past could significantly boost this task to the
     * fairness detriment of existing tasks.
     */
    if (renorm && !curr)
        se->vruntime += cfs_rq->min_vruntime;

    /*
     * When enqueuing a sched_entity, we must:
     *   - Update loads to have both entity and cfs_rq synced with now.
     *   - Add its load to cfs_rq->runnable_avg
     *   - For group_entity, update its weight to reflect the new share of
     *     its group cfs_rq
     *   - Add its new weight to cfs_rq->load.weight
     */
    update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
    enqueue_runnable_load_avg(cfs_rq, se);
    account_entity_enqueue(cfs_rq, se);

    /* Wakeups get (re)placed relative to min_vruntime with sleep credit. */
    if (flags & ENQUEUE_WAKEUP)
        place_entity(cfs_rq, se, 0);

    /* 'current' is never kept in the rbtree. */
    if (!curr)
        __enqueue_entity(cfs_rq, se);
    se->on_rq = 1;
}

static void __clear_buddies_last(struct sched_entity *se)
{
    struct cfs_rq *cfs_rq = cfs_rq_of(se);

    /* Drop the 'last' buddy hint only if it currently points at @se. */
    if (cfs_rq->last == se)
        cfs_rq->last = NULL;
}

static void __clear_buddies_next(struct sched_entity *se)
{
    struct cfs_rq *cfs_rq = cfs_rq_of(se);

    /* Drop the 'next' buddy hint only if it currently points at @se. */
    if (cfs_rq->next == se)
        cfs_rq->next = NULL;
}

static void __clear_buddies_skip(struct sched_entity *se)
{
    struct cfs_rq *cfs_rq = cfs_rq_of(se);

    /* Drop the 'skip' buddy hint only if it currently points at @se. */
    if (cfs_rq->skip == se)
        cfs_rq->skip = NULL;
}

/* Remove @se from every buddy slot (last/next/skip) it occupies. */
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    if (se == cfs_rq->last)
        __clear_buddies_last(se);

    if (se == cfs_rq->next)
        __clear_buddies_next(se);

    if (se == cfs_rq->skip)
        __clear_buddies_skip(se);
}

/*
 * dequeue_entity() - remove @se from @cfs_rq's timeline and accounting.
 *
 * The ordering here is delicate: vruntime denormalisation must happen
 * after update_curr() but before the final update_min_vruntime(), and
 * min_vruntime is only advanced when @se is really leaving (not a
 * save/restore cycle). See the inline comments.
 */
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
    /*
     * Update run-time statistics of the 'current'.
     */
    update_curr(cfs_rq);

    /*
     * When dequeuing a sched_entity, we must:
     *   - Update loads to have both entity and cfs_rq synced with now.
     *   - Subtract its load from the cfs_rq->runnable_avg.
     *   - Subtract its previous weight from cfs_rq->load.weight.
     *   - For group entity, update its weight to reflect the new share
     *     of its group cfs_rq.
     */
    update_load_avg(cfs_rq, se, UPDATE_TG);
    dequeue_runnable_load_avg(cfs_rq, se);

    /* A departing entity must not linger in any buddy slot. */
    clear_buddies(cfs_rq, se);

    /* 'current' is not in the tree, so only dequeue the others. */
    if (se != cfs_rq->curr)
        __dequeue_entity(cfs_rq, se);
    se->on_rq = 0;
    account_entity_dequeue(cfs_rq, se);

    /*
     * Normalize after update_curr(); which will also have moved
     * min_vruntime if @se is the one holding it back. But before doing
     * update_min_vruntime() again, which will discount @se's position and
     * can move min_vruntime forward still more.
     */
    if (!(flags & DEQUEUE_SLEEP))
        se->vruntime -= cfs_rq->min_vruntime;

    /*
     * Now advance min_vruntime if @se was the entity holding it back,
     * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
     * put back on, and if we advance min_vruntime, we'll be placed back
     * further than we started -- ie. we'll be penalized.
     */
    if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
        update_min_vruntime(cfs_rq);
}

/*
 * Preempt the current task with a newly woken task if needed:
 * called from the scheduler tick; reschedules 'curr' when it has
 * consumed its fair slice, or when the leftmost entity has fallen
 * too far behind in virtual time.
 */
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
    unsigned long ideal_runtime, delta_exec;
    struct sched_entity *se;
    i64 delta;

    /* Wall-clock runtime consumed since curr was last set running. */
    ideal_runtime = sched_slice(cfs_rq, curr);
    delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
    if (delta_exec > ideal_runtime) {
        resched_curr(rq_of(cfs_rq));
        /*
         * The current task ran long enough, ensure it doesn't get
         * re-elected due to buddy favours.
         */
        clear_buddies(cfs_rq, curr);
        return;
    }

    /*
     * Ensure that a task that missed wakeup preemption by a
     * narrow margin doesn't have to wait for a full slice.
     * This also mitigates buddy induced latencies under load.
     */
    if (delta_exec < sysctl_sched_min_granularity)
        return;

    /*
     * Compare against the leftmost queued entity. Note this deliberately
     * compares a vruntime difference against a wall-clock slice, matching
     * the historical CFS heuristic.
     */
    se = __pick_first_entity(cfs_rq);
    delta = curr->vruntime - se->vruntime;

    if (delta < 0)
        return;

    if (delta > ideal_runtime)
        resched_curr(rq_of(cfs_rq));
}

/*
 * set_next_entity() - make @se the running entity of @cfs_rq.
 *
 * Pulls @se out of the rbtree (the running entity is never kept in the
 * tree), starts its exec-time statistics window, and snapshots
 * sum_exec_runtime so slice consumption can be measured later.
 */
static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    /* 'current' is not kept within the tree. */
    if (se->on_rq) {
        /*
         * Remove @se from the tree and resync its load contribution
         * before it starts running.
         */
        __dequeue_entity(cfs_rq, se);
        update_load_avg(cfs_rq, se, UPDATE_TG);
    }

    update_stats_curr_start(cfs_rq, se);
    cfs_rq->curr = se;
    /* Baseline for check_preempt_tick()'s delta_exec computation. */
    se->prev_sum_exec_runtime = se->sum_exec_runtime;
}

static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);

/*
 * Pick the next process, keeping these things in mind, in this order:
 * 1) keep things fair between processes/task groups
 * 2) pick the "next" process, since someone really wants that to run
 * 3) pick the "last" process, for cache locality
 * 4) do not run the "skip" process, if something else is available
 *
 * @curr may be NULL (e.g. it was just dequeued); when set it is still
 * running and is not in the rbtree, so it must be compared against the
 * leftmost tree entity explicitly.
 */
static struct sched_entity *
pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
    struct sched_entity *left = __pick_first_entity(cfs_rq);
    struct sched_entity *se;

    /*
     * If curr is set we have to see if its left of the leftmost entity
     * still in the tree, provided there was anything in the tree at all.
     */
    if (!left || (curr && entity_before(curr, left)))
        left = curr;

    se = left; /* ideally we run the leftmost entity */

    /*
     * Avoid running the skip buddy, if running something else can
     * be done without getting too unfair.
     */
    if (cfs_rq->skip == se) {
        struct sched_entity *second;

        if (se == curr) {
            /* Skipping curr: the tree's leftmost is the runner-up. */
            second = __pick_first_entity(cfs_rq);
        } else {
            /* Skipping a tree entity: its successor, or curr if earlier. */
            second = __pick_next_entity(se);
            if (!second || (curr && entity_before(curr, second)))
                second = curr;
        }

        /* Only honour the skip if the runner-up is not too unfair. */
        if (second && wakeup_preempt_entity(second, left) < 1)
            se = second;
    }

    /*
     * Prefer last buddy, try to return the CPU to a preempted task.
     */
    if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
        se = cfs_rq->last;

    /*
     * Someone really wants this to run. If it's not unfair, run it.
     */
    if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
        se = cfs_rq->next;

    /* The chosen entity no longer needs its buddy favours. */
    clear_buddies(cfs_rq, se);

    return se;
}

/* Stop running @prev on @cfs_rq and, if still runnable, re-insert it. */
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
    if (prev->on_rq) {
        /*
         * Still on the runqueue: deactivate_task() was not called,
         * so finish its runtime accounting here, then put 'current'
         * back into the tree. (In the !on_rq case both the stats and
         * the load update already happened at dequeue time.)
         */
        update_curr(cfs_rq);
        __enqueue_entity(cfs_rq, prev);
        update_load_avg(cfs_rq, prev, 0);
    }
    cfs_rq->curr = NULL;
}

/*
 * entity_tick() - periodic tick work for the running entity @curr.
 *
 * @queued is non-zero when invoked from the hrtick, whose expiry is
 * already aligned to the slice end, so no slice check is needed there.
 */
static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
    /*
     * Update run-time statistics of the 'current'.
     */
    update_curr(cfs_rq);

    /*
     * Ensure that runnable average is periodically updated.
     */
    update_load_avg(cfs_rq, curr, UPDATE_TG);

    /*
     * queued ticks are scheduled to match the slice, so don't bother
     * validating it and just reschedule.
     */
    if (queued) {
        resched_curr(rq_of(cfs_rq));
        return;
    }
    /*
     * don't let the period tick interfere with the hrtick preemption
     */
    if (!sched_feat(DOUBLE_TICK) &&
            hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
        return;

    /* Slice enforcement only matters with competition on this cfs_rq. */
    if (cfs_rq->nr_running > 1)
        check_preempt_tick(cfs_rq, curr);
}

/* Task clock of the runqueue this cfs_rq hangs off. */
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
{
    struct rq *rq = rq_of(cfs_rq);

    return rq_clock_task(rq);
}

/**************************************************
 * CFS operations on tasks:
 */
/*
 * Arm the high-resolution tick to fire when @p's remaining slice runs
 * out; if the slice is already exhausted, reschedule immediately.
 */
static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
    struct sched_entity *se = &p->se;
    struct cfs_rq *cfs_rq = cfs_rq_of(se);

    SCHED_WARN_ON(task_rq(p) != rq);

    /* With a single hierarchical task there is nothing to preempt for. */
    if (rq->cfs.h_nr_running > 1) {
        u64 slice = sched_slice(cfs_rq, se);
        u64 consumed = se->sum_exec_runtime - se->prev_sum_exec_runtime;
        i64 remain = slice - consumed;

        if (remain < 0) {
            /* Slice already used up: force a reschedule now. */
            if (rq->curr == p)
                resched_curr(rq);
            return;
        }
        hrtick_start(rq, remain);
    }
}

/*
 * called from enqueue/dequeue and updates the hrtick when the
 * current task is from our class and nr_running is low enough
 * to matter.
 */
static void hrtick_update(struct rq *rq)
{
    struct task_struct *running = rq->curr;

    if (!hrtick_enabled(rq))
        return;
    if (running->sched_class != &fair_sched_class)
        return;

    /* Beyond sched_nr_latency the regular tick granularity suffices. */
    if (cfs_rq_of(&running->se)->nr_running < sched_nr_latency)
        hrtick_start_fair(rq, running);
}

static inline unsigned long cpu_util(int cpu);
static unsigned long capacity_of(int cpu);

static inline bool cpu_overutilized(int cpu)
{
    /*
     * Overutilized when utilization, inflated by capacity_margin,
     * exceeds the CPU capacity scaled to SCHED_CAPACITY_SCALE (1024).
     */
    return cpu_util(cpu) * capacity_margin > capacity_of(cpu) * 1024;
}

/* Latch the root-domain overutilized flag once any CPU trips the margin. */
static inline void update_overutilized_status(struct rq *rq)
{
    if (READ_ONCE(rq->rd->overutilized))
        return;

    if (cpu_overutilized(rq->cpu))
        WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
}

/*
 * The enqueue_task method is called before nr_running is
 * increased. Here we update the fair scheduling stats and
 * then put the task into the rbtree:
 */
static void
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
    struct cfs_rq *cfs_rq;
    struct sched_entity *se = &p->se;

    /*
     * The code below (indirectly) updates schedutil which looks at
     * the cfs_rq utilization to select a frequency.
     * Let's add the task's estimated utilization to the cfs_rq's
     * estimated utilization, before we update schedutil.
     */
    util_est_enqueue(&rq->cfs, p);

    /*
     * If in_iowait is set, the code below may not trigger any cpufreq
     * utilization updates, so do it here explicitly with the IOWAIT flag
     * passed.
     */
    if (p->in_iowait)
        cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);

    cfs_rq = cfs_rq_of(se);
    if (!se->on_rq) {
        enqueue_entity(cfs_rq, se, flags);
        flags = ENQUEUE_WAKEUP;
    }

    /*
     * Fix: count the task exactly once. The previous code incremented
     * h_nr_running both inside the !on_rq branch and unconditionally
     * afterwards, double-counting every fresh enqueue.
     */
    cfs_rq->h_nr_running++;
    update_load_avg(cfs_rq, se, UPDATE_TG);

    /*
     * Fix: 'se = &p->se' can never be NULL, so the old 'if (!se)' guard
     * was dead code and add_nr_running() was never reached, leaving
     * rq->nr_running permanently stale. Call it unconditionally.
     */
    add_nr_running(rq, 1);

    /*
     * Since new tasks are assigned an initial util_avg equal to
     * half of the spare capacity of their CPU, tiny tasks have the
     * ability to cross the overutilized threshold, which will
     * result in the load balancer ruining all the task placement
     * done by EAS. As a way to mitigate that effect, do not account
     * for the first enqueue operation of new tasks during the
     * overutilized flag detection.
     *
     * A better way of solving this problem would be to wait for
     * the PELT signals of tasks to converge before taking them
     * into account, but that is not straightforward to implement,
     * and the following generally works well enough in practice.
     */
    if (flags & ENQUEUE_WAKEUP)
        update_overutilized_status(rq);

    hrtick_update(rq);
}

static void set_next_buddy(struct sched_entity *se);

/*
 * The dequeue_task method is called before nr_running is
 * decreased. We remove the task from the rbtree and
 * update the fair scheduling stats:
 */
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
    struct cfs_rq *cfs_rq;
    struct sched_entity *se = &p->se;
    int task_sleep = flags & DEQUEUE_SLEEP;

    cfs_rq = cfs_rq_of(se);
    dequeue_entity(cfs_rq, se, flags);

    /*
     * Fix: decrement exactly once. The previous code decremented
     * h_nr_running twice per dequeue (before and after a dead
     * "don't dequeue parent" check left over from the group-sched
     * hierarchy walk, which is meaningless for a single entity).
     */
    cfs_rq->h_nr_running--;
    update_load_avg(cfs_rq, se, UPDATE_TG);

    /*
     * Fix: 'se = &p->se' can never be NULL, so the old 'if (!se)' guard
     * was dead and sub_nr_running() never ran, leaving rq->nr_running
     * stale. Call it unconditionally to mirror enqueue_task_fair().
     */
    sub_nr_running(rq, 1);

    /* Drop the task's estimated utilization from the root cfs_rq. */
    util_est_dequeue(&rq->cfs, p, task_sleep);
    hrtick_update(rq);
}

#define DEGRADE_SHIFT		7

/*
 * Number of missed ticks after which cpu_load[idx] has fully decayed to
 * zero; lets decay_load_missed() short-circuit long idle periods.
 */
static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
/*
 * Fixed-point decay factors consumed by decay_load_missed()'s
 * square-and-multiply loop: degrade_factor[idx][j] approximates
 * ((2^idx - 1) / 2^idx)^(2^j), scaled by 1 << DEGRADE_SHIFT.
 * NOTE(review): values carried over as-is; verify against the upstream
 * table if this file is re-synced.
 */
static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
    {   0,   0,  0,  0,  0,  0, 0, 0 },
    {  64,  32,  8,  0,  0,  0, 0, 0 },
    {  96,  72, 40, 12,  1,  0, 0, 0 },
    { 112,  98, 75, 43, 15,  1, 0, 0 },
    { 120, 112, 98, 76, 45, 16, 2, 0 }
};

/*
 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
 * would be when CPU is idle and so we just decay the old load without
 * adding any new load.
 */
static unsigned long
decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
{
    int bit;

    if (!missed_updates)
        return load;

    /* Past this many ticks the load has decayed away entirely. */
    if (missed_updates >= degrade_zero_ticks[idx])
        return 0;

    /* idx == 1 halves per tick, so a plain shift does the whole job. */
    if (idx == 1)
        return load >> missed_updates;

    /*
     * Square-and-multiply over the binary representation of
     * missed_updates, using the precomputed d^(2^bit) factors.
     */
    for (bit = 0; missed_updates; missed_updates >>= 1, bit++) {
        if (missed_updates & 1)
            load = (load * degrade_factor[idx][bit]) >> DEGRADE_SHIFT;
    }
    return load;
}

/**
 * __cpu_load_update - update the rq->cpu_load[] statistics
 * @this_rq: The rq to update statistics for
 * @this_load: The current load
 * @pending_updates: The number of missed updates
 *
 * Update rq->cpu_load[] statistics. This function is usually called every
 * scheduler tick (TICK_NSEC).
 *
 * This function computes a decaying average:
 *
 *   load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
 *
 * Because of NOHZ it might not get called on every tick which gives need for
 * the @pending_updates argument.
 *
 *   load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
 *             = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
 *             = A * (A * load[i]_n-2 + B) + B
 *             = A * (A * (A * load[i]_n-3 + B) + B) + B
 *             = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
 *             = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
 *             = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
 *             = (1 - 1/2^i)^n * (load[i]_0 - load) + load
 *
 * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
 * any change in load would have resulted in the tick being turned back on.
 *
 * For regular NOHZ, this reduces to:
 *
 *   load[i]_n = (1 - 1/2^i)^n * load[i]_0
 *
 * see decay_load_missed(). For NOHZ_FULL we get to subtract and add the extra
 * term.
 */
static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
                unsigned long pending_updates)
{
    unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
    int i, scale;

    this_rq->nr_load_updates++;

    /* Update our load: */
    this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
    for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
        unsigned long old_load, new_load;

        /* scale is effectively 1 << i now, and >> i divides by scale */

        old_load = this_rq->cpu_load[i];
        old_load = decay_load_missed(old_load, pending_updates - 1, i);
        if (tickless_load) {
            /* NOHZ_FULL: subtract the decayed and re-add the full term. */
            old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
            /*
             * old_load can never be a negative value because a
             * decayed tickless_load cannot be greater than the
             * original tickless_load.
             */
            old_load += tickless_load;
        }
        new_load = this_load;
        /*
         * Round up the averaging division if load is increasing. This
         * prevents us from getting stuck on 9 if the load is 10, for
         * example.
         */
        if (new_load > old_load)
            new_load += scale - 1;

        this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
    }
}

/* Used instead of source_load when we know the type == 0 */
static unsigned long weighted_cpuload(struct rq *rq)
{
    struct cfs_rq *root_cfs = &rq->cfs;

    return cfs_rq_runnable_load_avg(root_cfs);
}

/* Regular-tick path: one pending update, refresh the tick stamp. */
static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
{
    /* See the mess around cpu_load_update_nohz(). */
    this_rq->last_load_update_tick = READ_ONCE(jiffies_64);

    cpu_load_update(this_rq, load, 1);
}

/*
 * Called from scheduler_tick()
 */
void cpu_load_update_active(struct rq *this_rq)
{
    /* Sample the instantaneous runnable load and fold it in. */
    cpu_load_update_periodic(this_rq, weighted_cpuload(this_rq));
}

/*
 * Return a low guess at the load of a migration-source CPU weighted
 * according to the scheduling class and "nice" value.
 *
 * We want to under-estimate the load of migration sources, to
 * balance conservatively.
 */
static unsigned long source_load(int cpu, int type)
{
    struct rq *rq = cpu_rq(cpu);
    unsigned long instant = weighted_cpuload(rq);

    if (type == 0 || !sched_feat(LB_BIAS))
        return instant;

    /* Low estimate: whichever of history and instant load is smaller. */
    return min(rq->cpu_load[type-1], instant);
}

/*
 * Return a high guess at the load of a migration-target CPU weighted
 * according to the scheduling class and "nice" value.
 */
static unsigned long target_load(int cpu, int type)
{
    struct rq *rq = cpu_rq(cpu);
    unsigned long instant = weighted_cpuload(rq);

    if (type == 0 || !sched_feat(LB_BIAS))
        return instant;

    /* High estimate: whichever of history and instant load is larger. */
    return max(rq->cpu_load[type-1], instant);
}

static unsigned long capacity_of(int cpu)
{
    return cpu_rq(cpu)->cpu_capacity;
}

static unsigned long capacity_orig_of(int cpu)
{
    return cpu_rq(cpu)->cpu_capacity_orig;
}

/*
 * The purpose of wake_affine() is to quickly determine on which CPU we can run
 * soonest. For the purpose of speed we only consider the waking and previous
 * CPU.
 *
 * wake_affine_idle() - only considers 'now', it check if the waking CPU is
 *			cache-affine and is (or	will be) idle.
 *
 * wake_affine_weight() - considers the weight to reflect the average
 *			  scheduling latency of the CPUs. This seems to work
 *			  for the overloaded case.
 */
static int
wake_affine_idle(int this_cpu, int prev_cpu, int sync)
{
    /*
     * If this_cpu is idle, it implies the wakeup is from interrupt
     * context. Only allow the move if cache is shared. Otherwise an
     * interrupt intensive workload could force all tasks onto one
     * node depending on the IO topology or IRQ affinity settings.
     *
     * If the prev_cpu is idle and cache affine then avoid a migration.
     * There is no guarantee that the cache hot data from an interrupt
     * is more important than cache hot data on the prev_cpu and from
     * a cpufreq perspective, it's better to have higher utilisation
     * on one CPU.
     */
    if (available_idle_cpu(this_cpu)) {
        if (available_idle_cpu(prev_cpu))
            return prev_cpu;
        return this_cpu;
    }

    /* Sync wakeup with only the waker running: stack the wakee on it. */
    if (sync && cpu_rq(this_cpu)->nr_running == 1)
        return this_cpu;

    /* No opinion: signal "no affine target" to the caller. */
    return nr_cpumask_bits;
}

/*
 * wake_affine_weight() - pick between this_cpu and prev_cpu by comparing
 * their loads, each cross-multiplied by the *other* CPU's capacity so the
 * comparison is capacity-normalised without division.
 *
 * Returns this_cpu when it looks cheaper, nr_cpumask_bits ("no opinion")
 * otherwise.
 */
static int
wake_affine_weight(struct task_struct *p, int this_cpu, int prev_cpu, int sync)
{
    i64 this_eff_load, prev_eff_load;
    unsigned long task_load;

    this_eff_load = target_load(this_cpu, 0);

    if (sync) {
        /*
         * Sync wakeup: the waker is about to sleep, so discount its
         * load; if the waker alone outweighs the CPU load, this_cpu
         * is trivially the winner.
         */
        unsigned long current_load = task_h_load(current);

        if (current_load > this_eff_load)
            return this_cpu;

        this_eff_load -= current_load;
    }

    task_load = task_h_load(p);

    /* Cost of running @p here: add its load, bias, scale by prev's capacity. */
    this_eff_load += task_load;
    if (sched_feat(WA_BIAS))
        this_eff_load *= 100;
    this_eff_load *= capacity_of(prev_cpu);

    /*
     * Cost of leaving @p where it was: remove its load, apply the larger
     * bias (100 + 25/2 == 112, i.e. a migration-resisting imbalance
     * margin), scale by this CPU's capacity.
     */
    prev_eff_load = source_load(prev_cpu, 0);
    prev_eff_load -= task_load;
    if (sched_feat(WA_BIAS))
        prev_eff_load *= 100 + 25 / 2;
    prev_eff_load *= capacity_of(this_cpu);

    /*
     * If sync, adjust the weight of prev_eff_load such that if
     * prev_eff == this_eff that select_idle_sibling() will consider
     * stacking the wakee on top of the waker if no other CPU is
     * idle.
     */
    if (sync)
        prev_eff_load += 1;

    return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
}

static void attach_one_task(struct rq *rq, struct task_struct *p);
static struct task_struct *detach_one_task(struct rq *busy_rq, struct rq *idle_rq);

/*
 * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
 * running tasks off the busiest CPU onto idle CPUs. It requires at
 * least 1 task to be running on each physical CPU where possible, and
 * avoids physical / logical imbalances.
 *
 * @data is the busiest rq; the target is whichever CPU this stopper
 * callback executes on. Runs with the busiest rq lock held and IRQs
 * disabled until the detach is done; the attach happens lock-free on
 * the target afterwards.
 */
static int active_load_balance_cpu_stop(void *data)
{
    struct rq *busiest_rq = data;
    int busiest_cpu = cpu_of(busiest_rq);
    int target_cpu = smp_processor_id();
    struct rq *target_rq = cpu_rq(target_cpu);
    struct task_struct *p = NULL;
    struct rq_flags rf;

    rq_lock_irq(busiest_rq, &rf);

    /* A hotplug may have raced us here; bail if either side went down. */
    if (!cpu_online(busiest_cpu) || !cpu_online(target_cpu))
        goto out_unlock;

    /* Nothing to pull if the busiest CPU has at most its running task. */
    if (busiest_rq->nr_running <= 1)
        goto out_unlock;

    BUG_ON(busiest_rq == target_rq);

    /* Search for an sd spanning us and the target CPU. */
    rcu_read_lock();
    update_rq_clock(busiest_rq);
    p = detach_one_task(busiest_rq, target_rq);
    rcu_read_unlock();
out_unlock:
    rq_unlock(busiest_rq, &rf);

    /* Attach outside the busiest rq lock to avoid double-locking. */
    if (p)
        attach_one_task(target_rq, p);

    local_irq_enable();

    return 0;
}

static inline void update_blocked_averages(int cpu);

/*
 * trigger_load_balance() - tick-driven balance trigger for this fork's
 * simplified balancer: each CPU publishes a biased load estimate into
 * the root domain's min/max records, and when the spread exceeds a fixed
 * threshold the busiest CPU kicks a stopper on the idlest CPU to pull.
 *
 * NOTE(review): the 1000 threshold and the min/max records never being
 * refreshed downward (min only moves down, max only up) look like they
 * can go stale -- confirm against the fork's design notes.
 */
void trigger_load_balance(struct rq *rq)
{
    int cpu = rq->cpu;
    /* High and low load guesses, biased the same way as wake_affine_weight(). */
    i64 this_max_load = target_load(cpu, 0);
    i64 this_min_load = source_load(cpu, 0);
    int busy_cpu, idle_cpu, need_balance = 0;

    if (sched_feat(WA_BIAS))
        this_max_load *= 100;
    this_max_load *= capacity_of(cpu);

    /* 100 + 25/2 == 112: the migration-resisting imbalance margin. */
    if (sched_feat(WA_BIAS))
        this_min_load *= 100 + 25 / 2;
    this_min_load *= capacity_of(cpu);

    spin_lock(&rq->rd->loadavg_lock);
    if (rq->rd->max_loadavg < this_max_load) {
        rq->rd->max_loadavg = this_max_load;
        rq->rd->max_loadavg_cpu = cpu;
    } else if (rq->rd->min_loadavg > this_min_load) {
        rq->rd->min_loadavg = this_min_load;
        rq->rd->min_loadavg_cpu = cpu;
    }

    /* Imbalance beyond the threshold: remember who should push/pull. */
    if (rq->rd->max_loadavg - rq->rd->min_loadavg > 1000) {
        need_balance = 1;
        idle_cpu = rq->rd->min_loadavg_cpu;
        busy_cpu = rq->rd->max_loadavg_cpu;
    }
    spin_unlock(&rq->rd->loadavg_lock);

    if (need_balance) {
        update_blocked_averages(cpu);
        /*
         * Only the busiest CPU queues the stopper work (on the idle
         * CPU, with its own rq as the pull source); presumably this
         * prevents several CPUs queueing rq->active_balance_work
         * concurrently -- TODO confirm.
         */
        if (smp_processor_id() == busy_cpu)
            stop_one_cpu_nowait(idle_cpu, active_load_balance_cpu_stop, rq, &rq->active_balance_work);
    }
}

/*
 * idle_balance() - this fork's newidle balance: when a CPU is about to
 * go idle, queue a stopper on *this* CPU that pulls one task from the
 * root domain's recorded busiest CPU.
 *
 * NOTE(review): returns 1 unconditionally even on the early-out paths,
 * although nothing may have been (or ever be) pulled -- callers that
 * treat the return as "tasks pulled" should be checked.
 * NOTE(review): if max_loadavg_cpu == this_cpu, the queued stopper will
 * hit the BUG_ON(busiest_rq == target_rq) in
 * active_load_balance_cpu_stop() -- confirm this cannot happen.
 */
static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
{
    int this_cpu = this_rq->cpu;
    int busy_cpu;

    /*
     * We must set idle_stamp _before_ calling idle_balance(), such that we
     * measure the duration of idle_balance() as idle time.
     */
    this_rq->idle_stamp = rq_clock(this_rq);

    /* An offlining CPU must not pull work onto itself. */
    if (!cpu_online(this_cpu))
        return 0;

    /*
     * This is OK, because current is on_cpu, which avoids it being picked
     * for load-balance and preemption/IRQs are still disabled avoiding
     * further scheduler activity on it and we're being very careful to
     * re-start the picking loop.
     */
    rq_unpin_lock(this_rq, rf);

    /* Skip when idling is too short-lived or nothing is overloaded. */
    if (this_rq->avg_idle < sysctl_sched_migration_cost ||
        !READ_ONCE(this_rq->rd->overload))
        goto out;

    /* Drop our rq lock while doing the (potentially slow) balance work. */
    raw_spin_unlock(&this_rq->lock);
    update_blocked_averages(this_cpu);
    rcu_read_lock();
    spin_lock(&this_rq->rd->loadavg_lock);
    busy_cpu = this_rq->rd->max_loadavg_cpu;
    spin_unlock(&this_rq->rd->loadavg_lock);
    /* Pull from the recorded busiest rq via a stopper on this CPU. */
    stop_one_cpu_nowait(this_cpu, active_load_balance_cpu_stop, cpu_rq(busy_cpu), &cpu_rq(busy_cpu)->active_balance_work);
    rcu_read_unlock();

    raw_spin_lock(&this_rq->lock);
out:
    rq_repin_lock(this_rq, rf);

    return 1;
}

/**
 * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks
 * @cpu: the CPU to get the utilization of
 *
 * The unit of the return value must be the one of capacity so we can compare
 * the utilization with the capacity of the CPU that is available for CFS task
 * (ie cpu_capacity).
 *
 * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
 * recent utilization of currently non-runnable tasks on a CPU. It represents
 * the amount of utilization of a CPU in the range [0..capacity_orig] where
 * capacity_orig is the cpu_capacity available at the highest frequency
 * (arch_scale_freq_capacity()).
 * The utilization of a CPU converges towards a sum equal to or less than the
 * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
 * the running time on this CPU scaled by capacity_curr.
 *
 * The estimated utilization of a CPU is defined to be the maximum between its
 * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
 * currently RUNNABLE on that CPU.
 * This allows to properly represent the expected utilization of a CPU which
 * has just got a big task running since a long sleep period. At the same time
 * however it preserves the benefits of the "blocked utilization" in
 * describing the potential for other tasks waking up on the same CPU.
 *
 * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
 * higher than capacity_orig because of unfortunate rounding in
 * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
 * the average stabilizes with the new running time. We need to check that the
 * utilization stays within the range of [0..capacity_orig] and cap it if
 * necessary. Without utilization capping, a group could be seen as overloaded
 * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
 * available capacity. We allow utilization to overshoot capacity_curr (but not
 * capacity_orig) as it useful for predicting the capacity required after task
 * migrations (scheduler-driven DVFS).
 *
 * Return: the (estimated) utilization for the specified CPU
 */
static inline unsigned long cpu_util(int cpu)
{
    struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
    /* PELT utilization of the root cfs_rq. */
    unsigned int util = READ_ONCE(cfs_rq->avg.util_avg);

    /* Prefer the estimated utilization when it is higher. */
    if (sched_feat(UTIL_EST)) {
        unsigned int est = READ_ONCE(cfs_rq->avg.util_est.enqueued);

        if (est > util)
            util = est;
    }

    /* Clamp to the CPU's original capacity (see comment above). */
    return min_t(unsigned long, util, capacity_orig_of(cpu));
}

/*
 * Combine the two affine-wakeup heuristics: idle-based first, then
 * weight-based; fall back to prev_cpu when neither has an opinion.
 */
static int wake_affine(struct task_struct *p, int this_cpu, int prev_cpu, int sync)
{
    int target = nr_cpumask_bits;

    if (sched_feat(WA_IDLE))
        target = wake_affine_idle(this_cpu, prev_cpu, sync);

    if (target == nr_cpumask_bits && sched_feat(WA_WEIGHT))
        target = wake_affine_weight(p, this_cpu, prev_cpu, sync);

    return target == nr_cpumask_bits ? prev_cpu : target;
}

/*
 * select_task_rq_fair: Select target runqueue for the waking task in domains
 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
 *
 * Balances load by selecting the idlest CPU in the idlest group, or under
 * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
 *
 * Returns the target CPU number.
 *
 * preempt must be disabled.
 */
static int
select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
{
    int cpu = smp_processor_id();
    int new_cpu = prev_cpu;
    int want_affine = 0;
    int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);

    /* An affine wakeup is only valid if @p may run on the waking CPU. */
    want_affine = cpumask_test_cpu(cpu, &p->cpus_allowed);

    rcu_read_lock();
    /*
     * Fix: the previous for_each_online_cpu() loop always broke out on
     * its very first iteration, but not before overwriting 'cpu' (the
     * waking CPU) with the first online CPU -- so wake_affine() was
     * comparing prev_cpu against CPU0 instead of the waker's CPU.
     */
    if (want_affine && cpu != prev_cpu)
        new_cpu = wake_affine(p, cpu, prev_cpu, sync);
    rcu_read_unlock();

    return new_cpu;
}

static void detach_entity_cfs_rq(struct sched_entity *se);

/*
 * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
 * cfs_rq_of(p) references at time of call are still valid and identify the
 * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
 */
static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
{
    /*
     * As blocked tasks retain absolute vruntime the migration needs to
     * deal with this by subtracting the old and adding the new
     * min_vruntime -- the latter is done by enqueue_entity() when placing
     * the task on the new runqueue.
     */
    if (p->state == TASK_WAKING) {
        struct sched_entity *se = &p->se;
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
        u64 min_vruntime;

#ifndef CONFIG_64BIT
        u64 min_vruntime_copy;

        /*
         * 32-bit cannot read the 64-bit min_vruntime atomically;
         * spin on the seqcount-style copy until both halves agree.
         */
        do {
            min_vruntime_copy = cfs_rq->min_vruntime_copy;
            smp_rmb();
            min_vruntime = cfs_rq->min_vruntime;
        } while (min_vruntime != min_vruntime_copy);
#else
        min_vruntime = cfs_rq->min_vruntime;
#endif

        /* Denormalise; enqueue on the new CPU re-adds its min_vruntime. */
        se->vruntime -= min_vruntime;
    }

    /* Detach load from the old hierarchy, or queue removal if not locked. */
    if (p->on_rq == TASK_ON_RQ_MIGRATING)
        detach_entity_cfs_rq(&p->se);
    else
        remove_entity_load_avg(&p->se);

    /* Tell new CPU we are migrated */
    p->se.avg.last_update_time = 0;

    /* We have migrated, no longer consider this task hot */
    p->se.exec_start = 0;
}

/* A dead task contributes no future load: drop its PELT remnant. */
static void task_dead_fair(struct task_struct *p)
{
    struct sched_entity *se = &p->se;

    remove_entity_load_avg(se);
}

static unsigned long wakeup_gran(struct sched_entity *se)
{
    /*
     * Convert the wall-time wakeup granularity into 'se' virtual time.
     *
     * By using 'se' instead of 'curr' we penalize light tasks, so
     * they get preempted easier. That is, if 'se' < 'curr' then
     * the resulting gran will be larger, therefore penalizing the
     * lighter, if otoh 'se' > 'curr' then the resulting gran will
     * be smaller, again penalizing the lighter task.
     *
     * This is especially important for buddies when the leftmost
     * task is higher priority than the buddy.
     */
    return calc_delta_fair(sysctl_sched_wakeup_granularity, se);
}

/*
 * Should 'se' preempt 'curr'.
 *
 *             |s1
 *        |s2
 *   |s3
 *         g
 *      |<--->|c
 *
 *  w(c, s1) = -1
 *  w(c, s2) =  0
 *  w(c, s3) =  1
 *
 */
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
{
    i64 gran, vdiff = curr->vruntime - se->vruntime;

    if (vdiff <= 0)
        return -1;

    gran = wakeup_gran(se);
    if (vdiff > gran)
        return 1;

    return 0;
}

/* Record @se as the 'last' buddy; SCHED_IDLE tasks never qualify. */
static void set_last_buddy(struct sched_entity *se)
{
    if (unlikely(task_has_idle_policy(task_of(se))))
        return;
    if (SCHED_WARN_ON(!se->on_rq))
        return;

    cfs_rq_of(se)->last = se;
}

/* Record @se as the 'next' buddy; SCHED_IDLE tasks never qualify. */
static void set_next_buddy(struct sched_entity *se)
{
    if (unlikely(task_has_idle_policy(task_of(se))))
        return;
    if (SCHED_WARN_ON(!se->on_rq))
        return;

    cfs_rq_of(se)->next = se;
}

static void set_skip_buddy(struct sched_entity *se)
{
    cfs_rq_of(se)->skip = se;
}

/*
 * Preempt the current task with a newly woken task if needed:
 *
 * Called from check_preempt_curr() when task @p wakes up on @rq while
 * rq->curr is a fair task. Decides whether to resched rq->curr, and
 * maintains the NEXT_BUDDY / LAST_BUDDY pick hints along the way.
 */
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
    struct task_struct *curr = rq->curr;
    struct sched_entity *se = &curr->se, *pse = &p->se;
    struct cfs_rq *cfs_rq = task_cfs_rq(curr);
    /* Only bother with buddy hints on a busy runqueue. */
    int scale = cfs_rq->nr_running >= sched_nr_latency;
    int next_buddy_marked = 0;

    /* Waking the task that is already current: nothing to do. */
    if (unlikely(se == pse))
        return;

    /* Bias the next pick towards the wakee (but not for fresh forks). */
    if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
        set_next_buddy(pse);
        next_buddy_marked = 1;
    }

    /*
     * We can come here with TIF_NEED_RESCHED already set from new task
     * wake up path.
     *
     * Note: this also catches the edge-case of curr being in a throttled
     * group (e.g. via set_curr_task), since update_curr() (in the
     * enqueue of curr) will have resulted in resched being set.  This
     * prevents us from potentially nominating it as a false LAST_BUDDY
     * below.
     */
    if (test_tsk_need_resched(curr))
        return;

    /* Idle tasks are by definition preempted by non-idle tasks. */
    if (unlikely(task_has_idle_policy(curr)) &&
        likely(!task_has_idle_policy(p)))
        goto preempt;

    /*
     * Batch and idle tasks do not preempt non-idle tasks (their preemption
     * is driven by the tick):
     */
    if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
        return;

    /* Bring curr's vruntime up to date before comparing it with pse's. */
    update_curr(cfs_rq_of(se));
    BUG_ON(!pse);
    if (wakeup_preempt_entity(se, pse) == 1) {
        /*
         * Bias pick_next to pick the sched entity that is
         * triggering this preemption.
         */
        if (!next_buddy_marked)
            set_next_buddy(pse);
        goto preempt;
    }

    return;

preempt:
    resched_curr(rq);
    /*
     * Only set the backward buddy when the current task is still
     * on the rq. This can happen when a wakeup gets interleaved
     * with schedule on the ->pre_schedule() or idle_balance()
     * point, either of which can * drop the rq lock.
     *
     * Also, during early boot the idle thread is in the fair class,
     * for obvious reasons its a bad idea to schedule back to it.
     */
    if (unlikely(!se->on_rq || curr == rq->idle))
        return;

    if (sched_feat(LAST_BUDDY) && scale)
        set_last_buddy(se);
}

/*
 * Pick the next fair task to run on @rq, after putting back @prev.
 *
 * Returns the chosen task, RETRY_TASK if idle_balance() dropped the rq
 * lock and a higher-priority task may have appeared (caller must restart
 * the whole class walk), or NULL if no fair task is runnable.
 */
static struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
    struct cfs_rq *cfs_rq = &rq->cfs;
    struct sched_entity *se;
    struct task_struct *p;
    int new_tasks;

again:
    if (!cfs_rq->nr_running)
        goto idle;

    /* Return prev to the tree before we pick (it may be picked again). */
    put_prev_task(rq, prev);

    se = pick_next_entity(cfs_rq, NULL);
    set_next_entity(cfs_rq, se);

    p = task_of(se);

done: __maybe_unused;
    /*
     * Move the next running task to the front of
     * the list, so our cfs_tasks list becomes MRU
     * one.
     */
    list_move(&p->se.group_node, &rq->cfs_tasks);

    if (hrtick_enabled(rq))
        hrtick_start_fair(rq, p);

    return p;

idle:
    /* No runnable fair tasks: try to pull some from other CPUs. */
    new_tasks = idle_balance(rq, rf);

    /*
     * Because idle_balance() releases (and re-acquires) rq->lock, it is
     * possible for any higher priority task to appear. In that case we
     * must re-start the pick_next_entity() loop.
     */
    if (new_tasks < 0)
        return RETRY_TASK;

    if (new_tasks > 0)
        goto again;

    return NULL;
}

/*
 * Account for a descheduled task: hand the previous task's entity back
 * to its cfs_rq so it is re-inserted into the timeline.
 */
static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
{
    struct sched_entity *se = &prev->se;

    put_prev_entity(cfs_rq_of(se), se);
}

/*
 * sched_yield() is very simple.
 *
 * The magic of dealing with the ->skip buddy is in pick_next_entity():
 * marking ourselves as the skip buddy is enough to be passed over.
 */
static void yield_task_fair(struct rq *rq)
{
    struct task_struct *p = rq->curr;
    struct sched_entity *se = &p->se;
    struct cfs_rq *cfs_rq = task_cfs_rq(p);

    /* Alone on the runqueue: yielding is meaningless. */
    if (unlikely(rq->nr_running == 1))
        return;

    /* Yielding forfeits any buddy favours we may hold. */
    clear_buddies(cfs_rq, se);

    if (p->policy != SCHED_BATCH) {
        update_rq_clock(rq);
        /* Charge the time we ran so far to 'current'. */
        update_curr(cfs_rq);
        /*
         * Tell update_rq_clock() that we've just updated, so the
         * imminent schedule() doesn't pay for a microscopic clock
         * update and double the fastpath cost.
         */
        rq_clock_skip_update(rq);
    }

    set_skip_buddy(se);
}

/*
 * yield_to() support: yield the CPU and hint the picker to run @p next.
 * Returns false when @p cannot run (throttled hierarchy).
 */
static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
{
    struct sched_entity *target = &p->se;

    /* throttled hierarchies are not runnable */
    if (!target->on_rq)
        return false;

    /* Tell the scheduler that we'd really like pse to run next. */
    set_next_buddy(target);

    yield_task_fair(rq);

    return true;
}

/*
 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
 *
 * A task is migratable iff the destination CPU is in its affinity mask
 * and it is not currently running on the source runqueue.
 */
static
int can_migrate_task(struct task_struct *p, struct rq *busy_rq, struct rq *idle_rq)
{
    return cpumask_test_cpu(idle_rq->cpu, &p->cpus_allowed) &&
           !task_running(busy_rq, p);
}

/*
 * detach_task() -- detach the task for the migration specified in env
 *
 * Dequeues @p from @busy_rq and points it at @idle_rq's CPU. The
 * TASK_ON_RQ_MIGRATING marker must be set before deactivate_task() so
 * concurrent observers see the task as in-flight rather than dequeued.
 * The matching attach_task() completes the move on the destination rq.
 */
static void detach_task(struct task_struct *p, struct rq *busy_rq, struct rq *idle_rq)
{
    p->on_rq = TASK_ON_RQ_MIGRATING;
    deactivate_task(busy_rq, p, DEQUEUE_NOCLOCK);
    set_task_cpu(p, idle_rq->cpu);
}

/*
 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
 * part of active balancing operations within "domain".
 *
 * Walks the cfs_tasks list from the tail (the least-recently-run end,
 * since pick_next_task_fair() keeps the list MRU-ordered) and detaches
 * the first migratable task found.
 *
 * Returns a task if successful and NULL otherwise.
 */
static struct task_struct *detach_one_task(struct rq *busy_rq, struct rq *idle_rq)
{
    struct task_struct *p;

    list_for_each_entry_reverse(p, &busy_rq->cfs_tasks, se.group_node) {
        if (can_migrate_task(p, busy_rq, idle_rq)) {
            detach_task(p, busy_rq, idle_rq);
            return p;
        }
    }

    return NULL;
}

/*
 * attach_task() -- attach the task detached by detach_task() to its new rq.
 *
 * Caller must hold @rq's lock and have updated its clock. Enqueues @p,
 * clears the MIGRATING marker (set by detach_task()) only after the
 * enqueue, then lets @p contend for the CPU via check_preempt_curr().
 */
static void attach_task(struct rq *rq, struct task_struct *p)
{
    BUG_ON(task_rq(p) != rq);
    activate_task(rq, p, ENQUEUE_NOCLOCK);
    p->on_rq = TASK_ON_RQ_QUEUED;
    check_preempt_curr(rq, p, 0);
}

/*
 * attach_one_task() -- attaches the task returned from detach_one_task() to
 * its new rq.
 */
static void attach_one_task(struct rq *rq, struct task_struct *p)
{
    struct rq_flags rf;

    rq_lock(rq, &rf);
    update_rq_clock(rq);
    attach_task(rq, p);
    rq_unlock(rq, &rf);
}

/*
 * Refresh the blocked/decayed load averages of @cpu's runqueue: the
 * cfs_rq's PELT average plus the rt and dl per-class averages. Runs
 * under the rq lock with interrupts disabled, and records the jiffy of
 * this update in last_blocked_load_update_tick.
 */
static inline void update_blocked_averages(int cpu)
{
    struct rq *rq = cpu_rq(cpu);
    struct cfs_rq *cfs_rq = &rq->cfs;
    const struct sched_class *curr_class;
    struct rq_flags rf;

    rq_lock_irqsave(rq, &rf);
    update_rq_clock(rq);
    update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);

    /* rt/dl averages only accrue while a task of that class is current. */
    curr_class = rq->curr->sched_class;
    update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
    update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
    rq->last_blocked_load_update_tick = jiffies_64;
    rq_unlock_irqrestore(rq, &rf);
}

static unsigned long task_h_load(struct task_struct *p)
{
    return p->se.avg.load_avg;
}

/*
 * A runqueue came online: rescale the CPU-count-dependent sysctl
 * tunables (latency, granularity) for the new number of online CPUs.
 */
static void rq_online_fair(struct rq *rq)
{
    update_sysctl();
}

/*
 * A runqueue went offline: rescale the CPU-count-dependent sysctl
 * tunables for the reduced number of online CPUs.
 */
static void rq_offline_fair(struct rq *rq)
{
    update_sysctl();
}

/*
 * scheduler tick hitting a task of our scheduling class.
 *
 * NOTE: This function can be called remotely by the tick offload that
 * goes along full dynticks. Therefore no local assumption can be made
 * and everything must be accessed through the @rq and @curr passed in
 * parameters.
 */
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
    struct sched_entity *entity = &curr->se;

    /* Periodic bookkeeping and preemption check for the running entity. */
    entity_tick(cfs_rq_of(entity), entity, queued);

    /* Re-evaluate whether this rq has become over-utilized. */
    update_overutilized_status(task_rq(curr));
}

/*
 * called on fork with the child task as argument from the parent's context
 *  - child not yet on the tasklist
 *  - preemption disabled
 *
 * Gives the child a starting vruntime derived from the parent's, placed
 * by place_entity(), then stores it relative to min_vruntime so the
 * later enqueue on the (possibly different) destination CPU can
 * re-normalize it against that CPU's min_vruntime.
 */
static void task_fork_fair(struct task_struct *p)
{
    struct cfs_rq *cfs_rq;
    struct sched_entity *se = &p->se, *curr;
    struct rq *rq = this_rq();
    struct rq_flags rf;

    rq_lock(rq, &rf);
    update_rq_clock(rq);

    cfs_rq = task_cfs_rq(current);
    curr = cfs_rq->curr;
    if (curr) {
        /* Bring the parent up to date; the child inherits its vruntime. */
        update_curr(cfs_rq);
        se->vruntime = curr->vruntime;
    }
    /* initial=1: apply the new-task placement policy. */
    place_entity(cfs_rq, se, 1);

    if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
        /*
         * Upon rescheduling, sched_class::put_prev_task() will place
         * 'current' within the tree based on its new key value.
         */
        swap(curr->vruntime, se->vruntime);
        resched_curr(rq);
    }

    /* De-normalize: enqueue_entity() will add the target's min_vruntime. */
    se->vruntime -= cfs_rq->min_vruntime;
    rq_unlock(rq, &rf);
}

/*
 * Priority of the task has changed. Check to see if we preempt
 * the current task.
 */
static void
prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
{
    if (!task_on_rq_queued(p))
        return;

    if (rq->curr != p) {
        /*
         * Not running here: the boosted task may now be entitled
         * to preempt whatever is current on this runqueue.
         */
        check_preempt_curr(rq, p, 0);
        return;
    }

    /* Currently running: reschedule only if our priority decreased. */
    if (p->prio > oldprio)
        resched_curr(rq);
}

/*
 * Does @p's vruntime currently exclude its cfs_rq's min_vruntime
 * ("normalized"), or does it still include it?
 */
static inline bool vruntime_normalized(struct task_struct *p)
{
    struct sched_entity *se = &p->se;

    /*
     * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
     * the dequeue_entity(.flags=0) will already have normalized the
     * vruntime.
     */
    if (p->on_rq)
        return true;

    /*
     * A forked child that has never run and is still waiting for
     * wake_up_new_task() has an already-normalized vruntime.
     */
    if (!se->sum_exec_runtime)
        return true;

    /*
     * So does a task woken by try_to_wake_up() that is still waiting
     * for sched_ttwu_pending() to actually wake it remotely.
     */
    if (p->state == TASK_WAKING && p->sched_remote_wakeup)
        return true;

    return false;
}

/*
 * Remove @se's load contribution from its cfs_rq, first catching the
 * entity's PELT average up to the present so the removed amount is
 * accurate. Used when the entity leaves this cfs_rq (class/group change).
 */
static void detach_entity_cfs_rq(struct sched_entity *se)
{
    struct cfs_rq *cfs_rq = cfs_rq_of(se);

    /* Catch up with the cfs_rq and remove our load when we leave */
    update_load_avg(cfs_rq, se, 0);
    detach_entity_load_avg(cfs_rq, se);
}

/*
 * Add @se's load contribution to its (new) cfs_rq. Depending on the
 * ATTACH_AGE_LOAD feature, the entity's stale PELT average is either
 * aged forward first or attached as-is (SKIP_AGE_LOAD).
 */
static void attach_entity_cfs_rq(struct sched_entity *se)
{
    struct cfs_rq *cfs_rq = cfs_rq_of(se);

    /* Synchronize entity with its cfs_rq */
    update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
    attach_entity_load_avg(cfs_rq, se, 0);
}

/*
 * Detach @p from its cfs_rq on a class/group switch away from fair.
 * If the task's vruntime still includes min_vruntime, normalize it so
 * the absolute value stays meaningful while the task is outside CFS.
 */
static void detach_task_cfs_rq(struct task_struct *p)
{
    struct sched_entity *se = &p->se;
    struct cfs_rq *cfs_rq = cfs_rq_of(se);

    if (!vruntime_normalized(p)) {
        /*
         * Fix up our vruntime so that the current sleep doesn't
         * cause 'unlimited' sleep bonus.
         */
        place_entity(cfs_rq, se, 0);
        se->vruntime -= cfs_rq->min_vruntime;
    }

    detach_entity_cfs_rq(se);
}

/*
 * Attach @p to its cfs_rq on a class/group switch into fair; the
 * inverse of detach_task_cfs_rq(). Re-bases a normalized vruntime
 * onto the destination cfs_rq's min_vruntime.
 */
static void attach_task_cfs_rq(struct task_struct *p)
{
    struct sched_entity *se = &p->se;
    struct cfs_rq *cfs_rq = cfs_rq_of(se);

    attach_entity_cfs_rq(se);

    if (!vruntime_normalized(p))
        se->vruntime += cfs_rq->min_vruntime;
}

/* Task @p is leaving the fair class: detach it from its cfs_rq. */
static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
    detach_task_cfs_rq(p);
}

/* Task @p has joined the fair class: attach it and settle preemption. */
static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
    attach_task_cfs_rq(p);

    if (!task_on_rq_queued(p))
        return;

    /*
     * We were most likely switched from sched_rt, so
     * kick off the schedule if running, otherwise just see
     * if we can still preempt the current task.
     */
    if (rq->curr == p)
        resched_curr(rq);
    else
        check_preempt_curr(rq, p, 0);
}

/* Account for a task changing its policy or group.
 *
 * This routine is mostly called to set cfs_rq->curr field when a task
 * migrates between groups/classes.
 */
static void set_curr_task_fair(struct rq *rq)
{
    struct sched_entity *entity = &rq->curr->se;

    set_next_entity(cfs_rq_of(entity), entity);
}

/*
 * Initialize a cfs_rq: empty rbtree timeline, a min_vruntime chosen to
 * exercise u64 wrap-around early, and the removed-load spinlock.
 */
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
    cfs_rq->tasks_timeline = RB_ROOT_CACHED;
    /* Start just below wrap so vruntime overflow bugs surface quickly. */
    cfs_rq->min_vruntime = (u64)(-(1LL << 20));
#ifndef CONFIG_64BIT
    /* 32-bit reads of min_vruntime use a seqcount-like shadow copy. */
    cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
    raw_spin_lock_init(&cfs_rq->removed.lock);
}

/*
 * sched_rr_get_interval() support: report the task's current CFS slice
 * in jiffies, or 0 when the runqueue carries no fair load.
 */
static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
{
    struct sched_entity *se = &task->se;

    /*
     * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
     * idle runqueue:
     */
    if (!rq->cfs.load.weight)
        return 0;

    return NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
}

/*
 * All the scheduling class methods:
 */
const struct sched_class fair_sched_class = {
    /* Next lower class in the scheduling-class hierarchy. */
    .next			= &idle_sched_class,

    /* Enqueue/dequeue/yield entry points. */
    .enqueue_task		= enqueue_task_fair,
    .dequeue_task		= dequeue_task_fair,
    .yield_task		= yield_task_fair,
    .yield_to_task		= yield_to_task_fair,

    .check_preempt_curr	= check_preempt_wakeup,

    .pick_next_task		= pick_next_task_fair,
    .put_prev_task		= put_prev_task_fair,

    /* Placement and migration. */
    .select_task_rq		= select_task_rq_fair,
    .migrate_task_rq	= migrate_task_rq_fair,

    .rq_online		= rq_online_fair,
    .rq_offline		= rq_offline_fair,

    .task_dead		= task_dead_fair,
    .set_cpus_allowed	= set_cpus_allowed_common,

    .set_curr_task          = set_curr_task_fair,
    .task_tick		= task_tick_fair,
    .task_fork		= task_fork_fair,

    /* Class/priority transition hooks. */
    .prio_changed		= prio_changed_fair,
    .switched_from		= switched_from_fair,
    .switched_to		= switched_to_fair,

    .get_rr_interval	= get_rr_interval_fair,

    .update_curr		= update_curr_fair,
};

/*
 * Boot-time initialization hook for the fair class. Intentionally empty
 * in this configuration: nothing needs setting up here.
 */
__init void init_sched_fair_class(void)
{
}
